<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Achieving high performance &mdash; cuFFTDx 1.0.0 documentation</title>
      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
      <link rel="stylesheet" href="_static/cufftdx_override.css" type="text/css" />
  <!--[if lt IE 9]>
    <script src="_static/js/html5shiv.min.js"></script>
  <![endif]-->
  
        <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
        <script src="_static/jquery.js"></script>
        <script src="_static/underscore.js"></script>
        <script src="_static/doctools.js"></script>
    <script src="_static/js/theme.js"></script>
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="Requirements and Functionality" href="requirements_func.html" />
    <link rel="prev" title="First FFT using cuFFTDx" href="introduction.html" /> 
</head>

<body class="wy-body-for-nav"> 
  <div class="wy-grid-for-nav">
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" > 
            <a href="index.html" class="icon icon-home"> cuFFTDx
          </a>
              <div class="version">
                1.0.0
              </div>
<div role="search">
  <!-- Sidebar documentation search. The text field has no visible <label>,
       so aria-label supplies its accessible name; the placeholder alone is
       not a reliable label for assistive technology. -->
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>

  <style>
    /* Green background for the sidebar header and the mobile top bar. */
    .wy-side-nav-search,
    .wy-nav-top {
      background: #76b900;
    }

    /* Links inside those two headers are white in the link, visited and
       hover states alike. */
    .wy-side-nav-search a:link,
    .wy-side-nav-search a:visited,
    .wy-side-nav-search a:hover,
    .wy-nav-top a:link,
    .wy-nav-top a:visited,
    .wy-nav-top a:hover {
      color: #fff;
    }

    /* Light grey text for entries of the vertical navigation menu. */
    .wy-menu-vertical a:link,
    .wy-menu-vertical a:visited {
      color: #d9d9d9;
    }

    /* Menu entry flashes green while it is being activated. */
    .wy-menu-vertical a:active {
      background-color: #76b900;
    }

    /* Version label directly under the sidebar header: translucent black. */
    .wy-side-nav-search > div.version {
      color: rgba(0, 0, 0, 0.3);
    }

    /* Lift the table width restrictions: let cell text wrap ... */
    .wy-table-responsive table td,
    .wy-table-responsive table th {
      white-space: normal;
    }

    /* ... and keep the wrapper within its container without clipping. */
    .wy-table-responsive {
      max-width: 100%;
      margin-bottom: 24px;
      overflow: visible;
    }
  </style>
  
        </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
              <ul>
<li class="toctree-l1"><a class="reference internal" href="index.html">Documentation home</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">User guide:</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="introduction.html">First FFT using cuFFTDx</a><ul>
<li class="toctree-l2"><a class="reference internal" href="introduction.html#what-next">What next?</a></li>
<li class="toctree-l2"><a class="reference internal" href="introduction.html#compilation">Compilation</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="introduction.html#your-next-custom-fft-kernels">Your next custom FFT kernels</a><ul>
<li class="toctree-l2"><a class="reference internal" href="introduction.html#what-happens-under-the-hood">What happens under the hood?</a></li>
<li class="toctree-l2"><a class="reference internal" href="introduction.html#why">Why?</a></li>
</ul>
</li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Achieving high performance</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#general-advice">General advice</a></li>
<li class="toctree-l2"><a class="reference internal" href="#memory-management">Memory management</a></li>
<li class="toctree-l2"><a class="reference internal" href="#kernel-fusion">Kernel fusion</a></li>
<li class="toctree-l2"><a class="reference internal" href="#advanced">Advanced</a></li>
<li class="toctree-l2"><a class="reference internal" href="#further-reading">Further reading</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#references">References</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="requirements_func.html">Requirements and Functionality</a><ul>
<li class="toctree-l2"><a class="reference internal" href="requirements_func.html#requirements">Requirements</a><ul>
<li class="toctree-l3"><a class="reference internal" href="requirements_func.html#supported-compilers">Supported Compilers</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="requirements_func.html#supported-functionality">Supported Functionality</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="api/index.html">cuFFTDx API Reference</a><ul>
<li class="toctree-l2"><a class="reference internal" href="api/operators.html">Operators</a><ul>
<li class="toctree-l3"><a class="reference internal" href="api/operators.html#description-operators">Description Operators</a><ul>
<li class="toctree-l4"><a class="reference internal" href="api/operators.html#size-operator">Size Operator</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/operators.html#direction-operator">Direction Operator</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/operators.html#type-operator">Type Operator</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/operators.html#precision-operator">Precision Operator</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/operators.html#sm-operator">SM Operator</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="api/operators.html#execution-operators">Execution Operators</a><ul>
<li class="toctree-l4"><a class="reference internal" href="api/operators.html#thread-operator">Thread Operator</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/operators.html#block-operator">Block Operator</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/operators.html#block-configuration-operators">Block Configuration Operators</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="api/traits.html">Traits</a><ul>
<li class="toctree-l3"><a class="reference internal" href="api/traits.html#description-traits">Description Traits</a><ul>
<li class="toctree-l4"><a class="reference internal" href="api/traits.html#size-trait">Size Trait</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/traits.html#type-trait">Type Trait</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/traits.html#direction-trait">Direction Trait</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/traits.html#precision-trait">Precision Trait</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/traits.html#is-fft-trait">Is FFT? Trait</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/traits.html#is-fft-execution-trait">Is FFT Execution? Trait</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/traits.html#is-fft-complete-trait">Is FFT-complete? Trait</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/traits.html#is-fft-complete-execution-trait">Is FFT-complete Execution? Trait</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="api/traits.html#execution-traits">Execution Traits</a><ul>
<li class="toctree-l4"><a class="reference internal" href="api/traits.html#thread-traits">Thread Traits</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/traits.html#block-traits">Block Traits</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="api/methods.html">Execution Methods</a><ul>
<li class="toctree-l3"><a class="reference internal" href="api/methods.html#thread-execute-method">Thread Execute Method</a></li>
<li class="toctree-l3"><a class="reference internal" href="api/methods.html#block-execute-method">Block Execute Method</a><ul>
<li class="toctree-l4"><a class="reference internal" href="api/methods.html#value-format">Value Format</a></li>
<li class="toctree-l4"><a class="reference internal" href="api/methods.html#input-output-data-format">Input/Output Data Format</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="api/methods.html#make-workspace-function">Make Workspace Function</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="release_notes.html">Release Notes</a><ul>
<li class="toctree-l2"><a class="reference internal" href="release_notes.html#id1">1.0.0</a><ul>
<li class="toctree-l3"><a class="reference internal" href="release_notes.html#new-features">New Features</a></li>
<li class="toctree-l3"><a class="reference internal" href="release_notes.html#resolved-issues">Resolved Issues</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="release_notes.html#id2">0.3.1</a><ul>
<li class="toctree-l3"><a class="reference internal" href="release_notes.html#known-issues">Known Issues</a></li>
</ul>
</li>
</ul>
</li>
</ul>

        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="index.html">cuFFTDx</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="Page navigation">
  <!-- Breadcrumb trail. The first entry is an icon-only link to the
       documentation home page, so aria-label gives it an accessible name. -->
  <ul class="wy-breadcrumbs">
      <li><a href="index.html" class="icon icon-home" aria-label="Home"></a> &raquo;</li>
      <li>Achieving high performance</li>
      <li class="wy-breadcrumbs-aside">
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
             
  <div class="section" id="achieving-high-performance">
<h1>Achieving high performance<a class="headerlink" href="#achieving-high-performance" title="Permalink to this headline">¶</a></h1>
<p>In High-Performance Computing, the ability to write customized code enables
users to target better performance. In the case of cuFFTDx, the potential for
performance improvement of existing FFT applications is high, but it greatly
depends on how the library is used. Taking the regular cuFFT library as a
baseline, the performance may be up to one order of magnitude better or worse.
For this reason, porting existing sources to cuFFTDx should always be done in
parallel with performance analysis. Below we list general advice that
may help in this process.</p>
<div class="section" id="general-advice">
<h2>General advice<a class="headerlink" href="#general-advice" title="Permalink to this headline">¶</a></h2>
<ul class="simple">
<li><p>Start with the library-provided default settings for the best compute performance</p></li>
<li><p>Best parameters for compute bound and memory bound kernels might not be identical</p></li>
<li><p>Ensure the FFT kernel runs enough blocks to fill the GPU for peak performance</p></li>
<li><p>Merge adjacent memory bound kernels (pre- and post-processing) with an FFT kernel to save global memory trips</p></li>
</ul>
</div>
<div class="section" id="memory-management">
<h2>Memory management<a class="headerlink" href="#memory-management" title="Permalink to this headline">¶</a></h2>
<ul class="simple">
<li><p>Avoid reading/writing data from global memory</p></li>
<li><p>Ensure global memory reads/writes are coalesced (increase the value of <a class="reference internal" href="api/operators.html#fftsperblock-operator-label"><span class="std std-ref">FFTs Per Block Operator</span></a> if needed)</p></li>
<li><p>Use <code class="xref cpp cpp-any docutils literal notranslate"><span class="pre">shared</span></code> memory or extra registers to store the temporary data</p></li>
</ul>
</div>
<div class="section" id="kernel-fusion">
<h2>Kernel fusion<a class="headerlink" href="#kernel-fusion" title="Permalink to this headline">¶</a></h2>
<ul class="simple">
<li><p>For complex kernels, consider adjusting the FFT operation to match the user kernel
(i.e., tweaking <a class="reference internal" href="api/operators.html#ept-operator-label"><span class="std std-ref">Elements Per Thread Operator</span></a> will change the required CUDA block size). Upcoming versions of
cuFFTDx will offer more customization options.</p></li>
<li><p>For simple operations consider merging operations into FFT kernel optimized
for FFT performance.</p></li>
</ul>
</div>
<div class="section" id="advanced">
<h2>Advanced<a class="headerlink" href="#advanced" title="Permalink to this headline">¶</a></h2>
<ul class="simple">
<li><p>For FFT loads not filling the GPU entirely consider running parallel kernels in a separate stream</p></li>
<li><p>Use the <a class="reference external" href="https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html">CUDA Occupancy Calculator</a> <a class="footnote-reference brackets" href="#id9" id="id10">5</a> and/or the <a class="reference external" href="https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OCCUPANCY.html">cudaOccupancyMaxActiveBlocksPerMultiprocessor</a> <a class="footnote-reference brackets" href="#id14" id="id15">7</a> function to
determine the optimum launch parameters</p></li>
<li><p>Use the <a class="reference external" href="https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html">CUDA Occupancy Calculator</a> <a class="footnote-reference brackets" href="#id9" id="id11">5</a> or <a class="reference external" href="https://docs.nvidia.com/nsight-compute/NsightCompute/index.html#occupancy-calculator">Nsight Compute</a> <a class="footnote-reference brackets" href="#id12" id="id13">6</a> to determine what extra resources are available without losing occupancy</p></li>
</ul>
</div>
<div class="section" id="further-reading">
<h2>Further reading<a class="headerlink" href="#further-reading" title="Permalink to this headline">¶</a></h2>
<ul class="simple">
<li><p><a class="reference external" href="https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html">CUDA Best Practices Guide</a> <a class="footnote-reference brackets" href="#id1" id="id2">1</a></p></li>
<li><p><a class="reference external" href="https://docs.nvidia.com/cuda/turing-tuning-guide/index.html">Turing Tuning Guide</a> <a class="footnote-reference brackets" href="#id3" id="id4">2</a></p></li>
<li><p><a class="reference external" href="https://docs.nvidia.com/cuda/volta-tuning-guide/index.html">Volta Tuning Guide</a> <a class="footnote-reference brackets" href="#id5" id="id6">3</a></p></li>
<li><p><a class="reference external" href="https://docs.nvidia.com/cuda/ampere-tuning-guide/index.html">Ampere Tuning Guide</a> <a class="footnote-reference brackets" href="#id7" id="id8">4</a></p></li>
</ul>
<div class="section" id="references">
<h3>References<a class="headerlink" href="#references" title="Permalink to this headline">¶</a></h3>
<dl class="footnote brackets">
<dt class="label" id="id1"><span class="brackets"><a class="fn-backref" href="#id2">1</a></span></dt>
<dd><p><a class="reference external" href="https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html">https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html</a></p>
</dd>
<dt class="label" id="id3"><span class="brackets"><a class="fn-backref" href="#id4">2</a></span></dt>
<dd><p><a class="reference external" href="https://docs.nvidia.com/cuda/turing-tuning-guide/index.html">https://docs.nvidia.com/cuda/turing-tuning-guide/index.html</a></p>
</dd>
<dt class="label" id="id5"><span class="brackets"><a class="fn-backref" href="#id6">3</a></span></dt>
<dd><p><a class="reference external" href="https://docs.nvidia.com/cuda/volta-tuning-guide/index.html">https://docs.nvidia.com/cuda/volta-tuning-guide/index.html</a></p>
</dd>
<dt class="label" id="id7"><span class="brackets"><a class="fn-backref" href="#id8">4</a></span></dt>
<dd><p><a class="reference external" href="https://docs.nvidia.com/cuda/ampere-tuning-guide/index.html">https://docs.nvidia.com/cuda/ampere-tuning-guide/index.html</a></p>
</dd>
<dt class="label" id="id9"><span class="brackets">5</span><span class="fn-backref">(<a href="#id10">1</a>,<a href="#id11">2</a>)</span></dt>
<dd><p><a class="reference external" href="https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html">https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html</a></p>
</dd>
<dt class="label" id="id12"><span class="brackets"><a class="fn-backref" href="#id13">6</a></span></dt>
<dd><p><a class="reference external" href="https://docs.nvidia.com/nsight-compute/NsightCompute/index.html#occupancy-calculator">https://docs.nvidia.com/nsight-compute/NsightCompute/index.html#occupancy-calculator</a></p>
</dd>
<dt class="label" id="id14"><span class="brackets"><a class="fn-backref" href="#id15">7</a></span></dt>
<dd><p><a class="reference external" href="https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OCCUPANCY.html">https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OCCUPANCY.html</a></p>
</dd>
</dl>
</div>
</div>
</div>


           </div>
          </div>
          <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
        <a href="introduction.html" class="btn btn-neutral float-left" title="First FFT using cuFFTDx" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
        <a href="requirements_func.html" class="btn btn-neutral float-right" title="Requirements and Functionality" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
    </div>

  <hr/>

  <div role="contentinfo">
    <p>&#169; Copyright 2022, NVIDIA Corporation.</p>
  </div>

  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
    provided by <a href="https://readthedocs.org">Read the Docs</a>.
   

</footer>
        </div>
      </div>
    </section>
  </div>
  <script>
      jQuery(function () {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script>  

  <style>
  /* Unvisited and visited links share the same green. */
  a:link,
  a:visited {
    color: #76b900;
  }

  /* Slightly brighter green while a link is hovered. */
  a:hover {
    color: #88cc00;
  }

  /* Definition-list terms in the content area, excluding lists carrying the
     "docutils" class: dark green text and top rule on a faint green tint. */
  .rst-content dl:not(.docutils) dt {
    color: rgb(59, 93, 0);
    background: rgba(118, 185, 0, 0.1);
    border-top: 3px solid rgb(59, 93, 0);
  }
  </style>
  

</body>
</html>